library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
Here I obtain various demographic data, including income (percent below 50% and 80% of area median income), vehicle ownership, age, English language ability, and occupants per room.
# obtain the saved census data
setwd("~/Documents/2020 Spring Quarter/CEE 218Z")
acs_vars = readRDS("censusData2018_acs_acs5.rds")
setwd("~/Documents/2020 Spring Quarter/CEE 218Z/covid19")
# load in income data - code adapted from other students
sj_median_income_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "B19013_001E"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
rename(
Median_Income = B19013_001E
) %>%
filter(!is.na(Median_Income)) %>%
left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
filter(
!is.na(DISTRICTS)
) %>%
# this code joins our census data with the social distancing data, processed as shown below
left_join(sj_socialdistancing %>%
filter(date > max(date)-4 & date < max(date)) %>% # note I adjust this to avoid including a weekend
group_by(origin_census_block_group) %>%
summarize(
completely_home_device_count = sum(completely_home_device_count),
device_count = sum(device_count)) %>%
mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
by = c("blockgroup" = "origin_census_block_group")
) %>%
filter(
!is.na(device_count)
)
sj_ami_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B19001)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
group_by(blockgroup) %>%
summarize(
Total = B19001_001E,
`Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
#sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
`Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
) %>%
mutate(
`% under 75,000` = `Under 75,000` / Total * 100,
`% under 100,000` = `Under 100,000` / Total * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
) %>%
filter(!is.na(device_count))
# loading in language data - code adapted from other students
sj_lang_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B16004)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(
key = "variable",
value = "estimate",
- blockgroup
) %>%
left_join(acs_vars, by = c("variable" = "name")) %>%
mutate(
tier = substr(label,lapply(label, function(x) max(unlist(gregexpr('!!',x)))+2),nchar(label))
) %>%
filter(tier %in% c('Speak English "not well"',
'Speak English "not at all"',
'Total', 'Speak Spanish',
'Speak Asian and Pacific Island languages')) %>%
group_by(blockgroup, tier) %>%
summarise(
estimate1 = sum(estimate)
) %>%
spread(
key = "tier",
value = "estimate1"
) %>%
mutate(
`% speaking english < well` = (`Speak English "not well"` + `Speak English "not at all"`) / Total * 100,
`% speaking spanish` = (`Speak Spanish`/ Total) * 100,
`% speaking api` = (`Speak Asian and Pacific Island languages` / Total) * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>%
filter(!is.na(device_count))
# %>%
# mutate(
# log_perc = log(`% speaking english < well`)
# ) %>%
# filter(log_perc > 0)
# loading in age data - specifically looking at percentage 65+ and percentage <30
sj_age_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B01001)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(
key = "variable",
value = "estimate",
- blockgroup
) %>%
mutate(
label = acs_vars$label[match(variable,acs_vars$name)]
) %>%
select(-variable) %>%
separate(
label,
into = c(NA,NA,"sex","age"),
sep = "!!"
) %>% filter(!is.na(age)) %>%
mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>%
group_by(blockgroup) %>%
summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T)) %>%
mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>%
filter(!is.na(device_count))
# get data on vehicles available
sj_vehicles_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B992512)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
dplyr::select(B992512_001E, blockgroup) %>%
rename(total_vehicles = B992512_001E, blockgroup = blockgroup) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
mutate(`vehicles per capita` = total_vehicles / total) %>%
filter(!is.na(device_count))
# get data on occupants per room
sj_occupants_per_room_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25014)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>%
filter(!is.na(`occupants per room`)) %>%
group_by(blockgroup, `occupants per room`) %>%
summarize(estimate_tot = sum(estimate)) %>%
spread(key = `occupants per room`, value = estimate_tot) %>%
mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`, `percent 1 or more` = (`1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) * 100/ total_nums) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# first graph results
# age
sj_age_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents younger than 30",
y = "Percent devices completely at home in past 2 days",
title = "San Jose: Social Distancing and Young Age Groups"
)
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
ggplot(aes(
x = `percent elderly`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents ages 65 and older",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Elderly Population"
)
# income - less than $75000
sj_ami_by_block %>%
ggplot(aes(
x = `% under 75,000`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Households Above and Below 50% AMI"
)
# income - less than $100000
sj_ami_by_block %>%
ggplot(aes(
x = `% under 100,000`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $100,000 (80% AMI) annually",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Households Above and Below 80% AMI"
)
# language
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking english < well`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking English less than well",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and English Language Ability"
)
# occupants per room
sj_occupants_per_room_by_block %>%
ggplot(aes(
x = `percent 1 or more`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with more than 1 occupant per room",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Room Occupancy"
)
# vehicles
sj_vehicles_by_block %>%
ggplot(aes(
x = `vehicles per capita`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Number of vehicles per capita",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Vehicle Ownership"
)
# multiple regression - leaving out the vehicles but using the other four
modeltest <- lm(sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 75,000` + sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english < well` + sj_occupants_per_room_by_block$`percent 1 or more`)
summary(modeltest)
##
## Call:
## lm(formula = sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 75,000` +
## sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english < well` +
## sj_occupants_per_room_by_block$`percent 1 or more`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.377 -4.660 0.382 5.468 25.092
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 58.11055 1.84878 31.432
## sj_ami_by_block$`% under 75,000` -0.22548 0.02543 -8.866
## sj_age_by_block$`percent less than 30` -0.03203 0.04984 -0.643
## sj_lang_by_block$`% speaking english < well` 0.04676 0.05309 0.881
## sj_occupants_per_room_by_block$`percent 1 or more` -0.04907 0.05341 -0.919
## Pr(>|t|)
## (Intercept) <2e-16 ***
## sj_ami_by_block$`% under 75,000` <2e-16 ***
## sj_age_by_block$`percent less than 30` 0.521
## sj_lang_by_block$`% speaking english < well` 0.379
## sj_occupants_per_room_by_block$`percent 1 or more` 0.359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.465 on 562 degrees of freedom
## Multiple R-squared: 0.2018, Adjusted R-squared: 0.1961
## F-statistic: 35.52 on 4 and 562 DF, p-value: < 2.2e-16
Experimentation with other variables and other ways of analyzing the social distancing data. First I look at a few other possible variables.
# try getting other variables
# get data on units in structure
sj_units_in_structure_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25024)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "units"), sep = "!!") %>%
filter(!is.na(units)) %>%
spread(key = units, value = estimate) %>%
mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, `percent 20 or more` = (`20 to 49`+`50 or more`)* 100/ total_nums, `percent 1 only` = (`1, attached` + `1, detached`)*100/total_nums, `percent > 1` = 100 - `percent 1 only`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plot
sj_units_in_structure_by_block %>%
ggplot(aes(
x = `percent 20 or more`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of structures with more than 20 units",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and 20 or More Units Per Structure"
)
sj_units_in_structure_by_block %>%
ggplot(aes(
x = `percent 1 only`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of structures with only one unit",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Only 1 Unit Per Structure"
)
# load data on household type and size
sj_house_size_type_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B11016)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>%
filter(!is.na(type))
# household type
sj_house_type_by_block <- sj_house_size_type_by_block %>%
filter(is.na(size)) %>%
dplyr::select(-size) %>%
spread(key = type, value = estimate) %>%
mutate(`total households` = `Family households` + `Nonfamily households`, `percent nonfamily` = `Nonfamily households` / `total households`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
sj_house_type_by_block %>%
ggplot(aes(
x = `percent nonfamily`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent nonfamily households",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Household Type"
)
# household size
sj_house_size_by_block <- sj_house_size_type_by_block %>%
filter(!is.na(size)) %>%
dplyr::select(-type) %>%
group_by(blockgroup, size) %>%
summarize(`total of this size` = sum(estimate)) %>%
spread(key = size, value = `total of this size`) %>%
mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`, `percent 5 or more` = (`5-person household`+`6-person household` + `7-or-more person household`)* 100/ total_nums, `percent 1 or 2 only` = (`1-person household` + `2-person household`)*100/total_nums) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
sj_house_size_by_block %>%
ggplot(aes(
x = `percent 5 or more`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 5 or more people",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Households With 5 or More"
)
sj_house_size_by_block %>%
ggplot(aes(
x = `percent 1 or 2 only`,
y = `% Completely at Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 1 or 2 people",
y = "Percent devices completely at home in past 3 days",
title = "San Jose: Social Distancing and Small Household Size"
)
Next I consider different ways of looking at the social distancing data. First I try distance traveled.
# try other ways of looking at the social distancing data
# first look at total distance traveled
sj_sd_distance <- sj_socialdistancing %>%
filter(date > max(date)-4 & date < max(date)) %>%
group_by(origin_census_block_group) %>%
summarize(total_dist_traveled = sum(distance_traveled_from_home), device_count = sum(device_count)) %>%
mutate(total_dist_per_device = total_dist_traveled / device_count)
sj_distance_testing <- left_join(sj_ami_by_block, sj_sd_distance, by = c("blockgroup" = "origin_census_block_group")) %>% left_join(sj_age_by_block %>% select(blockgroup, `percent less than 30`))
sj_distance_testing %>% filter(total_dist_per_device < 500) %>%
ggplot(aes(
x = `% under 75,000`,
y = total_dist_per_device
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Average distance traveled per device in past 3 days",
title = "San Jose: Social Distancing and Income, Distance Metric"
)
This is very skewed by outliers, and probably not a useful metric.
Now I consider including devices that traveled <1km as staying at (or near) home.
sj_sd_range <- sj_socialdistancing %>%
filter(date > max(date)-4 & date < max(date)) %>%
mutate(travel_buckets_split = lapply(bucketed_distance_traveled, function(x) strsplit(x, "<1000")[[1]][2]), less_than_1km = lapply(travel_buckets_split, function(x) strsplit(x, ":")[[1]][2]), less_than_1km = lapply(less_than_1km, function(x) strsplit(x, ",")[[1]][1])) %>%
mutate(less_than_1km = lapply(less_than_1km, function(x) str_remove(x, "[}]"))) %>% # clean a bit more
mutate(less_than_1km = as.numeric(less_than_1km), less_than_1km = replace_na(less_than_1km, 0)) %>%
mutate(home_or_1km = completely_home_device_count + less_than_1km) %>%
group_by(origin_census_block_group) %>%
summarize(home_or_1km = sum(home_or_1km), device_count = sum(device_count)) %>%
mutate(`% Within 1km of Home` = (home_or_1km/device_count*100) %>% round(1))
# join this with other data
sj_1km_testing <- left_join(sj_ami_by_block, sj_sd_range, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_occupants_per_room_by_block %>% dplyr::select(`percent 1 or more`, blockgroup)) %>%
left_join(sj_age_by_block %>% dplyr::select(`percent less than 30`, blockgroup)) %>%
left_join(sj_lang_by_block %>% dplyr::select(`% speaking english < well`, blockgroup))
# plot with income
sj_1km_testing %>%
ggplot(aes(
x = `% under 75,000`,
y = `% Within 1km of Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Percent of devices within 1km of home in last 3 days",
title = "San Jose: Social Distancing and Income, 1km Range"
)
# plot with age
sj_1km_testing %>%
ggplot(aes(
x = `percent less than 30`,
y = `% Within 1km of Home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people younger than 30",
y = "Percent of devices within 1km of home in last 3 days",
title = "San Jose: Social Distancing and Age, 1km Range"
)
# run multiple regression model
modeltest2 <- lm(sj_1km_testing$`% Within 1km of Home` ~ sj_1km_testing$`% under 75,000` + sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english < well` + sj_1km_testing$`percent 1 or more`)
summary(modeltest2)
##
## Call:
## lm(formula = sj_1km_testing$`% Within 1km of Home` ~ sj_1km_testing$`% under 75,000` +
## sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english < well` +
## sj_1km_testing$`percent 1 or more`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.3838 -5.3019 0.8015 5.3307 25.7139
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 71.282623 1.879978 37.917
## sj_1km_testing$`% under 75,000` -0.206785 0.025862 -7.996
## sj_1km_testing$`percent less than 30` -0.154081 0.050683 -3.040
## sj_1km_testing$`% speaking english < well` -0.025237 0.053986 -0.467
## sj_1km_testing$`percent 1 or more` -0.001082 0.054308 -0.020
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## sj_1km_testing$`% under 75,000` 7.37e-15 ***
## sj_1km_testing$`percent less than 30` 0.00248 **
## sj_1km_testing$`% speaking english < well` 0.64034
## sj_1km_testing$`percent 1 or more` 0.98411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.608 on 562 degrees of freedom
## Multiple R-squared: 0.2195, Adjusted R-squared: 0.2139
## F-statistic: 39.51 on 4 and 562 DF, p-value: < 2.2e-16
It looks like the fit of these selected variables is slightly better for the social distancing data based on not traveling farther than 1km.
Now I also consider “non-work” behavior.
sj_nonworking_by_block <- sj_socialdistancing %>%
filter(date > max(date)-4 & date < max(date)) %>%
mutate(nonworking = device_count - completely_home_device_count - part_time_work_behavior_devices - full_time_work_behavior_devices) %>%
group_by(origin_census_block_group) %>%
summarize(nonworking_count = sum(nonworking), total_device = sum(device_count)) %>%
mutate(nonworking_percent = nonworking_count*100 / total_device) %>%
left_join(sj_1km_testing %>% dplyr::select(`% under 75,000`, `percent less than 30`, `% speaking english < well`, `percent 1 or more`, blockgroup), by = c("origin_census_block_group" = "blockgroup"))
# plot against age and income
sj_nonworking_by_block %>%
ggplot(aes(
x = `% under 75,000`,
y = nonworking_percent
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
y = "Percent of devices leaving home for non-work purposes in last 3 days",
title = "San Jose: Social Distancing and Income, Nonworking Behavior"
)
sj_nonworking_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = nonworking_percent
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people younger than 30",
y = "Percent of devices leaving home for non-work purposes in last 3 days",
title = "San Jose: Social Distancing and Age, Nonworking Behavior"
)
# multiple regression model
modeltest3 <- lm(sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% under 75,000` + sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english < well` + sj_nonworking_by_block$`percent 1 or more`)
summary(modeltest3)
##
## Call:
## lm(formula = sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% under 75,000` +
## sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english < well` +
## sj_nonworking_by_block$`percent 1 or more`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.426 -5.699 -0.508 4.549 33.790
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 35.817784 1.842671 19.438
## sj_nonworking_by_block$`% under 75,000` 0.110399 0.025349 4.355
## sj_nonworking_by_block$`percent less than 30` -0.169651 0.049677 -3.415
## sj_nonworking_by_block$`% speaking english < well` -0.009425 0.052914 -0.178
## sj_nonworking_by_block$`percent 1 or more` 0.144804 0.053230 2.720
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## sj_nonworking_by_block$`% under 75,000` 1.58e-05 ***
## sj_nonworking_by_block$`percent less than 30` 0.000684 ***
## sj_nonworking_by_block$`% speaking english < well` 0.858691
## sj_nonworking_by_block$`percent 1 or more` 0.006724 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.437 on 562 degrees of freedom
## Multiple R-squared: 0.08735, Adjusted R-squared: 0.08085
## F-statistic: 13.45 on 4 and 562 DF, p-value: 1.792e-10
These variables do worse for the percent nonworking metric.